#!/usr/bin/env python3
"""
构建词汇表脚本
用法: python scripts/build_vocab.py
"""
import sys
from pathlib import Path

# Add the project root to sys.path so `src` imports resolve when run as a script.
sys.path.append(str(Path(__file__).parent.parent))

from src.tokenizer import JiaboTokenizer
from src.utils import setup_logging


def main():
    """Build a vocabulary from the sample corpus and write it to disk.

    Trains a ``JiaboTokenizer`` on ``data/corpus_sample.txt`` and saves the
    resulting vocabulary (32k entries) to ``data/vocab.json``.

    Raises:
        SystemExit: if the corpus file does not exist.
    """
    setup_logging("INFO")

    corpus_path = "data/corpus_sample.txt"
    vocab_path = "data/vocab.json"
    vocab_size = 32000

    # Plain string here: the banner line has no placeholders, so the
    # original `f` prefix was extraneous (ruff F541).
    print("📚 正在从语料构建词汇表...")
    print(f"   语料路径: {corpus_path}")
    print(f"   词汇表大小: {vocab_size}")

    # Fail fast with a clear message instead of surfacing a deeper error
    # from inside the tokenizer training code.
    if not Path(corpus_path).is_file():
        raise SystemExit(f"语料文件不存在: {corpus_path}")

    tokenizer = JiaboTokenizer.train_from_corpus(
        corpus_path=corpus_path,
        output_path=vocab_path,
        vocab_size=vocab_size,
    )

    print(f"✅ 词汇表构建完成！词汇量: {len(tokenizer)}")


if __name__ == "__main__":
    main()
